#!/usr/bin/python3

# Using Multiprocessing to Make Python Code Faster
# see: https://medium.com/@urban_institute/using-multiprocessing-to-make-python-code-faster-23ea5ef996ba


# Convert files from the format generated by pandas (/ai-research-keyphrase-extraction/tweet_process.py) into files with an array of tweets
# Receives a directory as a parameter, and processes all files in that directory

import argparse
from datetime import datetime
import json
import re
import os
import sys
from os import listdir
from os.path import isfile, join
import multiprocessing

# Number of simultaneous worker processes
N_PROC = 24

parser = argparse.ArgumentParser()
parser.add_argument("-d", "--directory", required=True, help="file directory (required)")
parser.add_argument("-o", "--overwrite", required=False, action='store_true', help="crush existing TW files (optional)")
args = parser.parse_args()
directory = args.directory
overwrite = args.overwrite

# Input files are named "*json.out" or "*jsonl.out". The dot is escaped so
# that it only matches a literal "." (the unescaped form also accepted names
# such as "jsonXout").
_INPUT_NAME_RE = re.compile(r"jsonl?\.out$")

# Select the regular files of `directory` whose name matches the input
# pattern. Unless --overwrite was given, files that already have a ".tw"
# companion are skipped so that finished work is not redone.
onlyfiles = sorted(
	f for f in listdir(directory)
	if isfile(join(directory, f))
	and _INPUT_NAME_RE.search(f) is not None
	and (overwrite or not isfile(join(directory, f + ".tw")))
)
files = onlyfiles[:]
# Paths handed to the worker processes, built the same way as the paths
# used for the existence checks above.
dfiles = [join(directory, s) for s in files]

# Source fields a tweet's text may come from, in order of preference
_TEXT_FIELDS = ('text', 'qtext', 'rtext')
# Per-field column suffixes that get collapsed into one unprefixed key
_FIELD_SUFFIXES = ('emoji', 'hashtag', 'mention', 'symbol', 'url')


def _columns_to_tweets(data):
	"""Turn a column-oriented dict into a list of per-tweet dicts.

	`data` maps each column name to a dict indexed by the row number as a
	string ("0", "1", ...). The number of rows is taken from the first
	column. An empty `data` yields an empty list.
	"""
	keys = list(data)
	if not keys:
		return []

	number = len(data[keys[0]])
	twlist = []
	for i in range(number):
		row = str(i)
		tw = {key: data[key][row] for key in keys}
		if "created_at" in tw:
			# Presumably epoch milliseconds (the value is divided by 1000
			# before conversion). fromtimestamp() without a tz argument
			# produces a naive local-time datetime.
			t = int(tw["created_at"] / 1000)
			tw["created_at"] = datetime.fromtimestamp(t).isoformat()
		twlist.append(tw)
	return twlist


def _normalize_tweet(tw):
	"""Collapse the text/qtext/rtext variants of one tweet into single keys (in place).

	The first non-empty field among 'text', 'qtext' and 'rtext' is recorded
	in 'original_field' and its companion columns are copied to 'text',
	'clean_text', 'emoji', 'hashtag', 'mention', 'symbol', 'url',
	'keyphrase_text' and 'keyphrase_weight'. The per-field columns are then
	removed (a missing column raises KeyError).
	"""
	tw['id_str'] = str(tw['id_str'])

	field = next((name for name in _TEXT_FIELDS if tw[name]), 'none')
	tw['original_field'] = field

	if field == 'none':
		# No source field carries any text: there is no "clean_none" or
		# "none_*" column to read from, so every derived key is emitted
		# empty instead of failing with KeyError on the whole file.
		# NOTE(review): the suffix columns are assumed to be lists.
		tw['keyphrase_text'] = []
		tw['keyphrase_weight'] = []
		tw['text'] = ''
		tw['clean_text'] = ''
		for suffix in _FIELD_SUFFIXES:
			tw[suffix] = []
	else:
		keyphrases = tw[field + '.keyphrases']
		if keyphrases:
			tw['keyphrase_text'] = keyphrases[0]
			tw['keyphrase_weight'] = keyphrases[1]
		else:
			# Empty or null keyphrase entry
			tw['keyphrase_text'] = []
			tw['keyphrase_weight'] = []
		tw['text'] = tw[field]
		tw['clean_text'] = tw['clean_' + field]
		for suffix in _FIELD_SUFFIXES:
			tw[suffix] = tw[field + '_' + suffix]

	# Drop the per-field columns now that their content has been merged.
	# 'text' and 'clean_text' are kept: they hold the merged values.
	for name in ('qtext', 'rtext'):
		del tw[name]
		del tw['clean_' + name]
	for name in _TEXT_FIELDS:
		for suffix in _FIELD_SUFFIXES:
			del tw[name + '_' + suffix]
		del tw[name + '.keyphrases']


def multiprocessing_func(f):
	"""Convert one pandas-style JSON file into a JSON list of tweets.

	Reads the file at path `f`, rebuilds one dict per tweet, merges the
	text/qtext/rtext variants, and writes the resulting list as JSON to
	`f + ".tw"`, replacing any existing file of that name.

	Raises whatever the file operations, JSON parsing or missing columns
	raise (OSError, json.JSONDecodeError, KeyError, ...).
	"""
	ofname = f + ".tw"

	with open(f, 'r', encoding='utf-8') as file:
		data = json.load(file)

	twlist = _columns_to_tweets(data)
	for tw in twlist:
		_normalize_tweet(tw)

	# Serialize before opening the output, and let the context manager
	# close the file even if the write fails.
	payload = json.dumps(twlist)
	with open(ofname, "w", encoding='utf-8') as out:
		out.write(payload)


if __name__ == '__main__':
        # Convert every selected file in parallel. The pool is used as a
        # context manager so the worker processes are always cleaned up
        # (Pool.__exit__ calls terminate()), including on KeyboardInterrupt.
        exit_code = 0
        with multiprocessing.Pool(N_PROC) as pool:
                try:
                        pool.map(multiprocessing_func, dfiles)
                except Exception as e:
                        # map() re-raises an exception raised in a worker.
                        # Report it and make the failure visible in the
                        # exit status instead of exiting with 0.
                        print(e, file=sys.stderr)
                        exit_code = 1
        sys.exit(exit_code)

